{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# 首先 import 必要的模块\n",
    "import pandas as pd \n",
    "import numpy as np\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>feat_1</th>\n",
       "      <th>feat_2</th>\n",
       "      <th>feat_3</th>\n",
       "      <th>feat_4</th>\n",
       "      <th>feat_5</th>\n",
       "      <th>feat_6</th>\n",
       "      <th>feat_7</th>\n",
       "      <th>feat_8</th>\n",
       "      <th>feat_9</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_85</th>\n",
       "      <th>feat_86</th>\n",
       "      <th>feat_87</th>\n",
       "      <th>feat_88</th>\n",
       "      <th>feat_89</th>\n",
       "      <th>feat_90</th>\n",
       "      <th>feat_91</th>\n",
       "      <th>feat_92</th>\n",
       "      <th>feat_93</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 95 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \\\n",
       "0   1       1       0       0       0       0       0       0       0       0   \n",
       "1   2       0       0       0       0       0       0       0       1       0   \n",
       "2   3       0       0       0       0       0       0       0       1       0   \n",
       "3   4       1       0       0       1       6       1       5       0       0   \n",
       "4   5       0       0       0       0       0       0       0       0       0   \n",
       "\n",
       "   ...  feat_85  feat_86  feat_87  feat_88  feat_89  feat_90  feat_91  \\\n",
       "0  ...        1        0        0        0        0        0        0   \n",
       "1  ...        0        0        0        0        0        0        0   \n",
       "2  ...        0        0        0        0        0        0        0   \n",
       "3  ...        0        1        2        0        0        0        0   \n",
       "4  ...        1        0        0        0        0        1        0   \n",
       "\n",
       "   feat_92  feat_93   target  \n",
       "0        0        0  Class_1  \n",
       "1        0        0  Class_1  \n",
       "2        0        0  Class_1  \n",
       "3        0        0  Class_1  \n",
       "4        0        0  Class_1  \n",
       "\n",
       "[5 rows x 95 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 读取数据\n",
    "# path to where the data lies\n",
    "dpath = './data/'\n",
    "train = pd.read_csv(dpath +\"Otto_train.csv\")\n",
    "train.head()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 采用train_test_split，从数据集中随机抽取10000条记录，用于之后模型的训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 10000 entries, 60773 to 14100\n",
      "Data columns (total 95 columns):\n",
      "id         10000 non-null int64\n",
      "feat_1     10000 non-null int64\n",
      "feat_2     10000 non-null int64\n",
      "feat_3     10000 non-null int64\n",
      "feat_4     10000 non-null int64\n",
      "feat_5     10000 non-null int64\n",
      "feat_6     10000 non-null int64\n",
      "feat_7     10000 non-null int64\n",
      "feat_8     10000 non-null int64\n",
      "feat_9     10000 non-null int64\n",
      "feat_10    10000 non-null int64\n",
      "feat_11    10000 non-null int64\n",
      "feat_12    10000 non-null int64\n",
      "feat_13    10000 non-null int64\n",
      "feat_14    10000 non-null int64\n",
      "feat_15    10000 non-null int64\n",
      "feat_16    10000 non-null int64\n",
      "feat_17    10000 non-null int64\n",
      "feat_18    10000 non-null int64\n",
      "feat_19    10000 non-null int64\n",
      "feat_20    10000 non-null int64\n",
      "feat_21    10000 non-null int64\n",
      "feat_22    10000 non-null int64\n",
      "feat_23    10000 non-null int64\n",
      "feat_24    10000 non-null int64\n",
      "feat_25    10000 non-null int64\n",
      "feat_26    10000 non-null int64\n",
      "feat_27    10000 non-null int64\n",
      "feat_28    10000 non-null int64\n",
      "feat_29    10000 non-null int64\n",
      "feat_30    10000 non-null int64\n",
      "feat_31    10000 non-null int64\n",
      "feat_32    10000 non-null int64\n",
      "feat_33    10000 non-null int64\n",
      "feat_34    10000 non-null int64\n",
      "feat_35    10000 non-null int64\n",
      "feat_36    10000 non-null int64\n",
      "feat_37    10000 non-null int64\n",
      "feat_38    10000 non-null int64\n",
      "feat_39    10000 non-null int64\n",
      "feat_40    10000 non-null int64\n",
      "feat_41    10000 non-null int64\n",
      "feat_42    10000 non-null int64\n",
      "feat_43    10000 non-null int64\n",
      "feat_44    10000 non-null int64\n",
      "feat_45    10000 non-null int64\n",
      "feat_46    10000 non-null int64\n",
      "feat_47    10000 non-null int64\n",
      "feat_48    10000 non-null int64\n",
      "feat_49    10000 non-null int64\n",
      "feat_50    10000 non-null int64\n",
      "feat_51    10000 non-null int64\n",
      "feat_52    10000 non-null int64\n",
      "feat_53    10000 non-null int64\n",
      "feat_54    10000 non-null int64\n",
      "feat_55    10000 non-null int64\n",
      "feat_56    10000 non-null int64\n",
      "feat_57    10000 non-null int64\n",
      "feat_58    10000 non-null int64\n",
      "feat_59    10000 non-null int64\n",
      "feat_60    10000 non-null int64\n",
      "feat_61    10000 non-null int64\n",
      "feat_62    10000 non-null int64\n",
      "feat_63    10000 non-null int64\n",
      "feat_64    10000 non-null int64\n",
      "feat_65    10000 non-null int64\n",
      "feat_66    10000 non-null int64\n",
      "feat_67    10000 non-null int64\n",
      "feat_68    10000 non-null int64\n",
      "feat_69    10000 non-null int64\n",
      "feat_70    10000 non-null int64\n",
      "feat_71    10000 non-null int64\n",
      "feat_72    10000 non-null int64\n",
      "feat_73    10000 non-null int64\n",
      "feat_74    10000 non-null int64\n",
      "feat_75    10000 non-null int64\n",
      "feat_76    10000 non-null int64\n",
      "feat_77    10000 non-null int64\n",
      "feat_78    10000 non-null int64\n",
      "feat_79    10000 non-null int64\n",
      "feat_80    10000 non-null int64\n",
      "feat_81    10000 non-null int64\n",
      "feat_82    10000 non-null int64\n",
      "feat_83    10000 non-null int64\n",
      "feat_84    10000 non-null int64\n",
      "feat_85    10000 non-null int64\n",
      "feat_86    10000 non-null int64\n",
      "feat_87    10000 non-null int64\n",
      "feat_88    10000 non-null int64\n",
      "feat_89    10000 non-null int64\n",
      "feat_90    10000 non-null int64\n",
      "feat_91    10000 non-null int64\n",
      "feat_92    10000 non-null int64\n",
      "feat_93    10000 non-null int64\n",
      "target     10000 non-null object\n",
      "dtypes: int64(94), object(1)\n",
      "memory usage: 7.3+ MB\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "train, _ = train_test_split(\n",
    "\t\t\t\t\ttrain, train_size=10000, random_state=44)\n",
    "\n",
    "train.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "93个整数型匿名特征\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征工程"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "特征变换\n",
    "1. 取对数log1p（对线性模型很重要，单调变换树模型影响不大）\n",
    "2. tf-idf\n",
    "3. 原始特征组合（加减乘除。如果是计数特征，乘法表示“and”，更有意义（FM）；或者可采用GBDT做特征编码，实现更高阶特征组合；原始特征维数太高，也可以先用基础模型得到特征的重要性，对重要的特征再组合）\n",
    "4. t-SNE及PCA降维后的特征\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 分开特征和标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# 标签\n",
    "y_train = train['target']   #形式为Class_x\n",
    "\n",
    "#暂存id，其实id没什么用\n",
    "train_id = train['id']\n",
    "# drop ids and get labels\n",
    "X_train = train.drop([\"id\", \"target\"], axis=1)\n",
    "\n",
    "#保存特征名字\n",
    "columns_org = X_train.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. feat编码：log(x+1)\n",
    "原始特征feat_x看起来像计数特征，取log运算更接近人对数字的敏感度，更适合线性模型。\n",
    "同时也可以降低长维分布中大数值的影响，减弱长维分布的长尾性。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feat_1_log</th>\n",
       "      <th>feat_2_log</th>\n",
       "      <th>feat_3_log</th>\n",
       "      <th>feat_4_log</th>\n",
       "      <th>feat_5_log</th>\n",
       "      <th>feat_6_log</th>\n",
       "      <th>feat_7_log</th>\n",
       "      <th>feat_8_log</th>\n",
       "      <th>feat_9_log</th>\n",
       "      <th>feat_10_log</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_84_log</th>\n",
       "      <th>feat_85_log</th>\n",
       "      <th>feat_86_log</th>\n",
       "      <th>feat_87_log</th>\n",
       "      <th>feat_88_log</th>\n",
       "      <th>feat_89_log</th>\n",
       "      <th>feat_90_log</th>\n",
       "      <th>feat_91_log</th>\n",
       "      <th>feat_92_log</th>\n",
       "      <th>feat_93_log</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>2.564949</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.098612</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>1.386294</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2.302585</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.609438</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>2.944439</td>\n",
       "      <td>2.302585</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 93 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   feat_1_log  feat_2_log  feat_3_log  feat_4_log  feat_5_log  feat_6_log  \\\n",
       "0         0.0    0.693147    0.000000    0.000000         0.0         0.0   \n",
       "1         0.0    0.000000    0.000000    0.000000         0.0         0.0   \n",
       "2         0.0    0.000000    0.000000    0.000000         0.0         0.0   \n",
       "3         0.0    0.000000    0.693147    0.000000         0.0         0.0   \n",
       "4         0.0    0.693147    2.944439    2.302585         0.0         0.0   \n",
       "\n",
       "   feat_7_log  feat_8_log  feat_9_log  feat_10_log  ...  feat_84_log  \\\n",
       "0         0.0    0.000000    0.000000          0.0  ...          0.0   \n",
       "1         0.0    1.098612    0.000000          0.0  ...          0.0   \n",
       "2         0.0    0.000000    2.302585          0.0  ...          0.0   \n",
       "3         0.0    0.000000    1.609438          0.0  ...          0.0   \n",
       "4         0.0    0.000000    0.000000          0.0  ...          0.0   \n",
       "\n",
       "   feat_85_log  feat_86_log  feat_87_log  feat_88_log  feat_89_log  \\\n",
       "0          0.0     0.000000     0.000000          0.0     0.000000   \n",
       "1          0.0     0.000000     0.693147          0.0     0.000000   \n",
       "2          0.0     0.000000     0.000000          0.0     0.000000   \n",
       "3          0.0     0.693147     0.000000          0.0     0.000000   \n",
       "4          0.0     0.000000     0.000000          0.0     0.693147   \n",
       "\n",
       "   feat_90_log  feat_91_log  feat_92_log  feat_93_log  \n",
       "0     0.693147     2.564949     0.000000          0.0  \n",
       "1     0.000000     0.693147     1.386294          0.0  \n",
       "2     0.000000     0.000000     0.000000          0.0  \n",
       "3     0.000000     0.000000     0.000000          0.0  \n",
       "4     0.000000     0.000000     0.000000          0.0  \n",
       "\n",
       "[5 rows x 93 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_log = np.log1p(X_train)\n",
    "\n",
    "#重新组成DataFrame\n",
    "feat_names = columns_org + \"_log\"\n",
    "X_train_log = pd.DataFrame(columns = feat_names, data = X_train_log.values)\n",
    "\n",
    "X_train_log.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. feat编码：TF-IDF\n",
    "原始特征feat_x看起来像计数特征，类似文本分析中词频特征的处理，TF-IDF可以突出对特别类别有贡献的低频词。\n",
    "这里原始特征已经是计数特征了，直接调用TfidfTransformer，将计数特征变成TF-IDF\n",
    "如果输入是原始文本，需要将计数功能（TF）和IDF功能集中在一起，用TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feat_1_tfidf</th>\n",
       "      <th>feat_2_tfidf</th>\n",
       "      <th>feat_3_tfidf</th>\n",
       "      <th>feat_4_tfidf</th>\n",
       "      <th>feat_5_tfidf</th>\n",
       "      <th>feat_6_tfidf</th>\n",
       "      <th>feat_7_tfidf</th>\n",
       "      <th>feat_8_tfidf</th>\n",
       "      <th>feat_9_tfidf</th>\n",
       "      <th>feat_10_tfidf</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_84_tfidf</th>\n",
       "      <th>feat_85_tfidf</th>\n",
       "      <th>feat_86_tfidf</th>\n",
       "      <th>feat_87_tfidf</th>\n",
       "      <th>feat_88_tfidf</th>\n",
       "      <th>feat_89_tfidf</th>\n",
       "      <th>feat_90_tfidf</th>\n",
       "      <th>feat_91_tfidf</th>\n",
       "      <th>feat_92_tfidf</th>\n",
       "      <th>feat_93_tfidf</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.067479</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.063287</td>\n",
       "      <td>0.903245</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.108573</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.060903</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.083634</td>\n",
       "      <td>0.174977</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.865234</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.082846</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.339160</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.060041</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.031398</td>\n",
       "      <td>0.449345</td>\n",
       "      <td>0.21807</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.024199</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 93 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   feat_1_tfidf  feat_2_tfidf  feat_3_tfidf  feat_4_tfidf  feat_5_tfidf  \\\n",
       "0           0.0      0.067479      0.000000       0.00000           0.0   \n",
       "1           0.0      0.000000      0.000000       0.00000           0.0   \n",
       "2           0.0      0.000000      0.000000       0.00000           0.0   \n",
       "3           0.0      0.000000      0.082846       0.00000           0.0   \n",
       "4           0.0      0.031398      0.449345       0.21807           0.0   \n",
       "\n",
       "   feat_6_tfidf  feat_7_tfidf  feat_8_tfidf  feat_9_tfidf  feat_10_tfidf  ...  \\\n",
       "0           0.0           0.0      0.000000      0.000000            0.0  ...   \n",
       "1           0.0           0.0      0.108573      0.000000            0.0  ...   \n",
       "2           0.0           0.0      0.000000      0.865234            0.0  ...   \n",
       "3           0.0           0.0      0.000000      0.339160            0.0  ...   \n",
       "4           0.0           0.0      0.000000      0.000000            0.0  ...   \n",
       "\n",
       "   feat_84_tfidf  feat_85_tfidf  feat_86_tfidf  feat_87_tfidf  feat_88_tfidf  \\\n",
       "0            0.0            0.0       0.000000       0.000000            0.0   \n",
       "1            0.0            0.0       0.000000       0.060903            0.0   \n",
       "2            0.0            0.0       0.000000       0.000000            0.0   \n",
       "3            0.0            0.0       0.060041       0.000000            0.0   \n",
       "4            0.0            0.0       0.000000       0.000000            0.0   \n",
       "\n",
       "   feat_89_tfidf  feat_90_tfidf  feat_91_tfidf  feat_92_tfidf  feat_93_tfidf  \n",
       "0       0.000000       0.063287       0.903245       0.000000            0.0  \n",
       "1       0.000000       0.000000       0.083634       0.174977            0.0  \n",
       "2       0.000000       0.000000       0.000000       0.000000            0.0  \n",
       "3       0.000000       0.000000       0.000000       0.000000            0.0  \n",
       "4       0.024199       0.000000       0.000000       0.000000            0.0  \n",
       "\n",
       "[5 rows x 93 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# transform counts to TFIDF features\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "tfidf = TfidfTransformer()\n",
    "\n",
    "#输出稀疏矩阵\n",
    "X_train_tfidf = tfidf.fit_transform(X_train).toarray()\n",
    "\n",
    "#重新组成DataFrame,为了可视化\n",
    "feat_names = columns_org + \"_tfidf\"\n",
    "X_train_tfidf = pd.DataFrame(columns = feat_names, data = X_train_tfidf)\n",
    "\n",
    "X_train_tfidf.head()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据预处理\n",
    "由于数据极度稀疏，数据缩放应采用MinMaxScaler，使得变换后的数据继续保持稀疏。\n",
    "如果将特征看似词频这种特征，也可以不用缩放，每个样本用模长归一"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# 对原始数据缩放\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "# 构造输入特征的标准化器\n",
    "ms_org = MinMaxScaler()\n",
    "\n",
    "#保存特征名字，用于结果保存为csv\n",
    "feat_names_org = X_train.columns\n",
    "\n",
    "# 用训练训练模型（得到均值和标准差）：fit\n",
    "# 并对训练数据进行特征缩放：transform\n",
    "X_train = ms_org.fit_transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# 对log数据缩放\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "# 构造输入特征的标准化器\n",
    "ms_log = MinMaxScaler()\n",
    "\n",
    "#保存特征名字，用于结果保存为csv\n",
    "feat_names_log = X_train_log.columns\n",
    "\n",
    "# 用训练训练模型（得到均值和标准差）：fit\n",
    "# 并对训练数据进行特征缩放：transform\n",
    "X_train_log = ms_log.fit_transform(X_train_log)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# 对tf-idf数据缩放\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "#保存特征名字，用于结果保存为csv\n",
    "feat_names_tfidf = X_train_tfidf.columns\n",
    "\n",
    "# 构造输入特征的标准化器\n",
    "ms_tfidf = MinMaxScaler()\n",
    "\n",
    "# 用训练训练模型（得到均值和标准差）：fit\n",
    "# 并对训练数据进行特征缩放：transform\n",
    "X_train_tfidf = ms_tfidf.fit_transform(X_train_tfidf)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#保存原始特征\n",
    "y = pd.Series(data = y_train, name = 'target')\n",
    "feat_names = columns_org\n",
    "train_org = pd.concat([pd.DataFrame(train_id).reset_index(drop=True), pd.DataFrame(columns = feat_names_org, data = X_train).reset_index(drop=True), pd.DataFrame(y).reset_index(drop=True)], axis = 1)\n",
    "train_org.to_csv(dpath +'Otto_FE_train_org.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#保存log特征变换结果\n",
    "y = pd.Series(data = y_train, name = 'target')\n",
    "train_log = pd.concat([pd.DataFrame(train_id).reset_index(drop=True), pd.DataFrame(columns = feat_names_log, data = X_train_log), pd.DataFrame(y).reset_index(drop=True)], axis = 1)\n",
    "train_log.to_csv(dpath +'Otto_FE_train_log.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#保存tf-idf特征变换结果\n",
    "y = pd.Series(data = y_train, name = 'target')\n",
    "train_tfidf = pd.concat([pd.DataFrame(train_id).reset_index(drop=True), pd.DataFrame(columns = feat_names_tfidf, data = X_train_tfidf), pd.DataFrame(y).reset_index(drop=True)], axis = 1)\n",
    "train_tfidf.to_csv(dpath +'Otto_FE_train_tfidf.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 保存特征编码过程中用到的模型，用于后续对测试数据的特征编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import _pickle as cPickle\n",
    "\n",
    "cPickle.dump(tfidf, open(dpath +\"tfidf.pkl\", 'wb'))\n",
    "\n",
    "cPickle.dump(ms_org, open(dpath +\"MinMaxSclaer_org.pkl\", 'wb'))\n",
    "cPickle.dump(ms_log, open(dpath +\"MinMaxSclaer_log.pkl\", 'wb'))\n",
    "cPickle.dump(ms_tfidf, open(dpath +\"MinMaxSclaer_tfidf.pkl\", 'wb'))\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "## PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>feat_1_tfidf</th>\n",
       "      <th>feat_2_tfidf</th>\n",
       "      <th>feat_3_tfidf</th>\n",
       "      <th>feat_4_tfidf</th>\n",
       "      <th>feat_5_tfidf</th>\n",
       "      <th>feat_6_tfidf</th>\n",
       "      <th>feat_7_tfidf</th>\n",
       "      <th>feat_8_tfidf</th>\n",
       "      <th>feat_9_tfidf</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_85_tfidf</th>\n",
       "      <th>feat_86_tfidf</th>\n",
       "      <th>feat_87_tfidf</th>\n",
       "      <th>feat_88_tfidf</th>\n",
       "      <th>feat_89_tfidf</th>\n",
       "      <th>feat_90_tfidf</th>\n",
       "      <th>feat_91_tfidf</th>\n",
       "      <th>feat_92_tfidf</th>\n",
       "      <th>feat_93_tfidf</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>60774</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.114118</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.064774</td>\n",
       "      <td>0.919982</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>58063</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.110486</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.072534</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.085184</td>\n",
       "      <td>0.2132</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8088</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.911355</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>25937</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.102255</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.357239</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.063196</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>45311</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.053099</td>\n",
       "      <td>0.554614</td>\n",
       "      <td>0.247825</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.028413</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 95 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      id  feat_1_tfidf  feat_2_tfidf  feat_3_tfidf  feat_4_tfidf  \\\n",
       "0  60774           0.0      0.114118      0.000000      0.000000   \n",
       "1  58063           0.0      0.000000      0.000000      0.000000   \n",
       "2   8088           0.0      0.000000      0.000000      0.000000   \n",
       "3  25937           0.0      0.000000      0.102255      0.000000   \n",
       "4  45311           0.0      0.053099      0.554614      0.247825   \n",
       "\n",
       "   feat_5_tfidf  feat_6_tfidf  feat_7_tfidf  feat_8_tfidf  feat_9_tfidf  ...  \\\n",
       "0           0.0           0.0           0.0      0.000000      0.000000  ...   \n",
       "1           0.0           0.0           0.0      0.110486      0.000000  ...   \n",
       "2           0.0           0.0           0.0      0.000000      0.911355  ...   \n",
       "3           0.0           0.0           0.0      0.000000      0.357239  ...   \n",
       "4           0.0           0.0           0.0      0.000000      0.000000  ...   \n",
       "\n",
       "   feat_85_tfidf  feat_86_tfidf  feat_87_tfidf  feat_88_tfidf  feat_89_tfidf  \\\n",
       "0            0.0       0.000000       0.000000            0.0       0.000000   \n",
       "1            0.0       0.000000       0.072534            0.0       0.000000   \n",
       "2            0.0       0.000000       0.000000            0.0       0.000000   \n",
       "3            0.0       0.063196       0.000000            0.0       0.000000   \n",
       "4            0.0       0.000000       0.000000            0.0       0.028413   \n",
       "\n",
       "   feat_90_tfidf  feat_91_tfidf  feat_92_tfidf  feat_93_tfidf   target  \n",
       "0       0.064774       0.919982         0.0000            0.0  Class_9  \n",
       "1       0.000000       0.085184         0.2132            0.0  Class_9  \n",
       "2       0.000000       0.000000         0.0000            0.0  Class_2  \n",
       "3       0.000000       0.000000         0.0000            0.0  Class_3  \n",
       "4       0.000000       0.000000         0.0000            0.0  Class_6  \n",
       "\n",
       "[5 rows x 95 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "\n",
    "train = pd.read_csv(dpath +\"Otto_FE_train_tfidf.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 准备数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "y_train = train['target']   \n",
    "X_train = train.drop([\"id\", \"target\"], axis=1)\n",
    "\n",
    "#用于存储pca变换后的特征\n",
    "train_id = train['id']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## PCA降维"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "pca = PCA(n_components = 0.85)\n",
    "pca.fit(X_train)\n",
    "    \n",
    "# 在训练集和测试集降维 \n",
    "X_train_pca = pca.transform(X_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 绘制PCA维的方差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD4CAYAAADiry33AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAQI0lEQVR4nO3df6zdd13H8efLlg0cZoOuGF2Ht2TVUAKi1kICKm4yOocrxi7pQK3JTDWhCQYIdiaOUSFhxlD/YCY2brps6rZM0RtXnAvDHyE4eseA0Y2GS53sWsLubBlOs43C2z/Ot3o4u7f32/Xc3t3PfT6Sm/v9fr6f7zmfT3r6+n7u53zP56SqkCS163uWugGSpMVl0EtS4wx6SWqcQS9JjTPoJalxq5e6AaPOP//8mpiYWOpmSNKycv/99z9eVWvnOva8C/qJiQmmpqaWuhmStKwk+ff5jjl1I0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWpcr6BPsiXJoSTTSXbPcfynk3w2yfEk20aO7Ujy5e5nx7gaLknqZ8FPxiZZBdwAvBmYAQ4kmayqh4aqfRX4NeC9I+e+FHg/sAko4P7u3GPjaf6zTey+67v2H/nw5Yv1VJK0LPQZ0W8GpqvqcFU9A9wGbB2uUFWPVNUXgO+MnPsW4J6qOtqF+z3AljG0W5LUU5+gvwB4dGh/pivro9e5SXYmmUoyNTs72/OhJUl99An6zFHW94tme51bVfuqalNVbVq7ds7F1yRJz1GfoJ8BLhzaXwcc6fn4p3OuJGkM+gT9AWBDkvVJzgK2A5M9H/9u4NIkL0nyEuDSrkySdIYsGPRVdRzYxSCgHwbuqKqDSfYkuQIgyU8mmQGuBP44ycHu3KPA7zG4WBwA9nRlkqQzpNcXj1TVfmD/SNm1Q9sHGEzLzHXuTcBNp9FGSdJp8JOxktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMb1CvokW5IcSjKdZPccx89Ocnt3/L4kE135C5LcnOTBJA8nuWa8zZckLWTBoE+yCrgBuAzYCFyVZONItauBY1V1EbAXuL4rvxI4u6peDfwE8BsnLgKSpDOjz4h+MzBdVYer6hngNmDrSJ2twM3d9p3AJUkCFHBOktXAi4BngG+OpeWSpF76BP0FwKND+zNd2Zx1quo48ASwhkHo/zfwNeCrwB9U1dHRJ0iyM8lUkqnZ2dlT7oQkaX59gj5zlFXPOpuBbwM/CKwH3pPkFc+qWLWvqjZV1aa1a9f2aJIkqa8+QT8DXDi0vw44Ml+dbprmXOAo8Hbg76vqW1X1GPApYNPpNlqS1F+foD8AbEiyPslZwHZgcqTOJLCj294G3FtVxWC65uIMnAO8HvjSeJouSepjwaDv5tx3AXcDDwN3VNXBJHuSXNFVuxFYk2QaeDdw4hbMG4AXA19kcMH406r6wpj7IEk6idV9KlXVfmD/SNm1Q9tPMbiVcvS8J+cqlySdOX4yVpIaZ9BLUuMMeklqnEEvSY0z6CWpcb3uumnBxO67nlX2yIcvX4KWSNKZ5Yhekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS43oFfZItSQ4lmU6ye47jZye5vTt+X5KJoWOvSfLpJAeTPJjkheNrviRpIQsGfZJVwA3AZcBG4KokG0eqXQ0cq6qLgL3A9d25q4Fbgd+sqlcBbwK+NbbWS5IW1GdEvxmYrqrDVfUMcBuwdaTOVuDmbvtO4JIkAS4FvlBVnweoqv+sqm+Pp+mSpD76BP0FwKND+zNd2Zx1quo48ASwBvhhoJLcneSzSd431xMk2ZlkKsnU7OzsqfZBknQSfYI+c5RVzzqrgTcC7+h+/2KSS55VsWpfVW2qqk1r167t0SRJUl99gn4GuHBofx1wZL463bz8ucDRrvyfqurxqvofYD/w46fbaElSf32C/gCwIcn6JGcB24HJkTqTwI5uextwb1UVcDfwmiTf210AfgZ4aDxNlyT1sXqhClV1PMkuBqG9Cripqg4m2QNMVdUkcCNwS5JpBiP57d25x5J8hMHFooD9VXXXIvVFkjSHBYMeoKr2M5h2GS67dmj7KeDKec69lcEtlpKkJeAnYyWpcQa9JDXOoJekxvWao2/ZxO5nvzf8yIcvX4KWSNLicEQvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNW/GfjJ2Pn5iV1ApH9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxvnFI6fILySRtNw4opekxvUK+iRbkhxKMp1k9xzHz05ye3f8viQTI8dfnuTJJO8dT7MlSX0tGPRJVgE3AJcBG4GrkmwcqXY1cKyqLgL2AtePHN8LfPz0mytJOlV9RvSbgemqOlxVzwC3AVtH6mwFbu627wQuSRKAJG8DDgMHx9NkSdKp6BP0FwCPDu3PdGVz1qmq48ATwJok5wC/DXzgZE+QZGeSqSRTs7OzfdsuSeqhT9BnjrLqWecDwN6qevJkT1BV+6pqU1VtWrt2bY8mSZL66nN75Qxw4dD+OuDIPHVmkqwGzgWOAq8DtiX5feA84DtJnqqqj552yyVJvfQJ+gPAhiTrgf8AtgNvH6kzCewAPg1sA+6tqgJ+6kSFJNcBTxryknRmLRj0VXU8yS7gbmAVcFNVHUyyB5iqqkngRuCWJNMMRvLbF7PRkqT+en0ytqr2A/tHyq4d2n4KuHKBx7juObRPknSa/GSsJDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxfPDImfiGJpOcrR/SS1DiDXpIaZ9BLUuOco19kzt1LWmqO6CWpcQa9JDXOoJekxhn0ktQ4g16SGuddN0vEu3EknSmO6CWpcQa9JDXOoJekxjlH/zzj3L2kcXNEL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNa5X0CfZkuRQkukku+c4fnaS27vj9yWZ6MrfnOT+JA92vy8eb/MlSQtZMOiTrAJuAC4DNgJXJdk4Uu1q4FhVXQTsBa7vyh8HfqGqXg3sAG4ZV8MlSf30Wb1yMzBdVYcBktwGbAUeGqqzFbiu274T+GiSVNUDQ3UOAi9McnZVPX3aLV+BRle2dFVLSX30mbq5AHh0aH+mK5uzTlUdB54A1ozU+SXggblCPsnOJFNJpmZnZ/u2XZLUQ58RfeYoq1Opk+RVDKZzLp3rCapqH7APYNOmTaOPrQW4hr2kk+kT9DPAhUP764Aj89SZSbIaOBc4CpBkHfAx4Fer6iun3WL15gVAEvSbujkAbEiyPslZwHZgcqTOJIM3WwG2AfdWVSU5D7gLuKaqPjWuRkuS+lsw6Ls5913A3cDDwB1VdTDJniRXdNVuBNYkmQbeDZy4BXMXcBHwu0k+1/28bOy9kCTNq9d3xlbVfmD/SNm1Q9tPAVfOcd4HgQ+eZhslSafBLwdfgZy7l1YWl0CQpMY5otf/mW+k718A0vLmiF6SGmfQS1LjnLrRc+aUjrQ8OKKXpMYZ9JLUOINekhrnHL3Gzrl76fnFoNcZ4wVAWhoGvZ4X5vv2LC8O0ukz6LUseQGQ+jPo1RQvANKzGfRaEbwAaCUz6LWiuZCbVgKDXjoFXhi0HBn00iLzjiItNYNeep451b8a/CtDCzHopRXGC8PKY9BLOqmTXQCclloeDHpJZ8xiT0t5gZmbQS+pec/lwjDfXyvLkUEvSadgOf714Xr0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMb1CvokW5IcSjKdZPccx89Ocnt3/L4kE0PHrunKDyV5y/iaLknqY8GgT7IKuAG4DNgIXJVk40i1q4FjVXURsBe4vjt3I7AdeBWwBfij7vEkSWdInxH9ZmC6qg5X1TPAbcDWkTpbgZu77TuBS5KkK7+tqp6uqn8DprvHkySdIamqk1dItgFbqurXu/1fAV5XVbuG6nyxqzPT7X8FeB1wHfCvVXVrV34j8PGqunPkOXYCO7vdHwEOnX7XOB94fAyPs5zY55XBPq8cp9LvH6qqtXMd6LOoWeYoG706zFenz7lU1T5gX4+29JZkqqo2jfMxn+/s88pgn1eOcfW7z9TNDHDh0P464Mh8dZKsBs4FjvY8V5K0iPoE/QFgQ5L1Sc5i8Obq5EidSWBHt70NuLcGc0KTwPburpz1wAbgM+NpuiSpjwWnbqrqeJJdwN3AKuCmqjqYZA8wVVWTwI3ALUmmGYzkt3fnHkxyB/AQcBx4Z1V9e5H6MmqsU0HLhH1eGezzyjGWfi/4ZqwkaXnzk7GS1DiDXpIa11zQL7RcQyuS3JTkse4zDCfKXprkniRf7n6/ZCnbOG5JLkzyySQPJzmY5F1debP9TvLCJJ9J8vmuzx/oytd3y418uVt+5Kylbuu4JVmV5IEkf9ftN93nJI8keTDJ55JMdWVjeW03FfQ9l2toxZ8xWFZi2G7gE1W1AfhEt9+S48B7quqVwOuBd3b/vi33+2ng4qr6UeC1wJYkr2ewzMjers/HGCxD0pp3AQ8P7a+EPv9sVb126N75sby2mwp6+i3X0ISq+mcGdzgNG16K4mbgbWe0UYusqr5WVZ/ttv+LQQhcQMP9roEnu90XdD8FXMxguRForM8ASdYBlwN/0u2Hxvs8j7G8tlsL+guAR4f2Z7qyleL7q+prMAhF4GVL3J5F062Q+mPAfTTe724K43PAY8A9wFeAb1TV8a5Ki6/zPwTeB3yn219D+30u4B+S3N8tCwNjem33WQJhOem15IKWtyQvBv4K+K2q+uZgsNeu7rMnr01yHvAx4JVzVTuzrVo8Sd4KPFZV9yd504niOao20+fOG6rqSJKXAfck+dK4Hri1Ef1KX3Lh60l+AKD7/dgSt2fskryAQcj/eVX9dVfcfL8BquobwD8yeH/ivG65EWjvdf4G4IokjzCYfr2YwQi/5T5TVUe6348xuKBvZkyv7daCvs9yDS0bXopiB/C3S9iWsevmaW8EHq6qjwwdarbfSdZ2I3mSvAj4OQbvTXySwXIj0Fifq+qaqlpXVRMM/g/fW1XvoOE+Jzknyfed2AYuBb7ImF7bzX0yNsnPM7j6n1iu4UNL3KRFkeQvgTcxWMb068D7gb8B7gBeDnwVuLKqRt+wXbaSvBH4F+BB/n/u9ncYzNM32e8kr2HwJtwqBgOzO6pqT5JXMBjtvhR4APjlqnp66Vq6OLqpm/dW1Vtb7nPXt491u6uBv6iqDyVZwxhe280FvSTpu7U2dSNJGmHQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMb9L8xeBfNuf6KFAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "plt.bar(range(len(pca.explained_variance_ratio_)), pca.explained_variance_ratio_)\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#保存PCA特征变换结果\n",
    "n_components = pca.n_components_\n",
    "feat_names_pca = []\n",
    "for i in range(n_components):\n",
    "    feat_names_pca.append(\"pca_\" + str(i))\n",
    "\n",
    "y = pd.Series(data = y_train, name = 'target')\n",
    "train_pca = pd.concat([train_id, pd.DataFrame(columns = feat_names_pca, data = X_train_pca), y], axis = 1)\n",
    "train_pca.to_csv(dpath +'Otto_FE_train_PCA.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 保存特征编码过程中用到的模型，用于后续对测试数据的特征编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import _pickle as cPickle\n",
    "\n",
    "cPickle.dump(pca, open(dpath + \"pca.pkl\", 'wb'))\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RBF SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>pca_0</th>\n",
       "      <th>pca_1</th>\n",
       "      <th>pca_2</th>\n",
       "      <th>pca_3</th>\n",
       "      <th>pca_4</th>\n",
       "      <th>pca_5</th>\n",
       "      <th>pca_6</th>\n",
       "      <th>pca_7</th>\n",
       "      <th>pca_8</th>\n",
       "      <th>...</th>\n",
       "      <th>pca_41</th>\n",
       "      <th>pca_42</th>\n",
       "      <th>pca_43</th>\n",
       "      <th>pca_44</th>\n",
       "      <th>pca_45</th>\n",
       "      <th>pca_46</th>\n",
       "      <th>pca_47</th>\n",
       "      <th>pca_48</th>\n",
       "      <th>pca_49</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>60774</td>\n",
       "      <td>0.098996</td>\n",
       "      <td>0.180128</td>\n",
       "      <td>-0.068552</td>\n",
       "      <td>-0.151364</td>\n",
       "      <td>-0.052621</td>\n",
       "      <td>-0.171841</td>\n",
       "      <td>-0.052340</td>\n",
       "      <td>-0.004493</td>\n",
       "      <td>-0.124143</td>\n",
       "      <td>...</td>\n",
       "      <td>0.330419</td>\n",
       "      <td>0.142985</td>\n",
       "      <td>-0.090690</td>\n",
       "      <td>0.039132</td>\n",
       "      <td>-0.218198</td>\n",
       "      <td>0.355507</td>\n",
       "      <td>0.019291</td>\n",
       "      <td>-0.111644</td>\n",
       "      <td>0.076873</td>\n",
       "      <td>Class_9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>58063</td>\n",
       "      <td>0.051463</td>\n",
       "      <td>0.085689</td>\n",
       "      <td>-0.035412</td>\n",
       "      <td>-0.177042</td>\n",
       "      <td>-0.086306</td>\n",
       "      <td>-0.057775</td>\n",
       "      <td>-0.145977</td>\n",
       "      <td>0.062874</td>\n",
       "      <td>-0.020713</td>\n",
       "      <td>...</td>\n",
       "      <td>0.017201</td>\n",
       "      <td>0.033115</td>\n",
       "      <td>0.124033</td>\n",
       "      <td>-0.082698</td>\n",
       "      <td>-0.016145</td>\n",
       "      <td>-0.001503</td>\n",
       "      <td>0.042375</td>\n",
       "      <td>0.148873</td>\n",
       "      <td>-0.189273</td>\n",
       "      <td>Class_9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8088</td>\n",
       "      <td>-0.050043</td>\n",
       "      <td>0.491164</td>\n",
       "      <td>0.020679</td>\n",
       "      <td>0.463688</td>\n",
       "      <td>0.124514</td>\n",
       "      <td>0.247532</td>\n",
       "      <td>-0.088920</td>\n",
       "      <td>-0.123791</td>\n",
       "      <td>-0.075584</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.066895</td>\n",
       "      <td>-0.005588</td>\n",
       "      <td>-0.084927</td>\n",
       "      <td>-0.120549</td>\n",
       "      <td>0.104887</td>\n",
       "      <td>-0.052950</td>\n",
       "      <td>0.065886</td>\n",
       "      <td>-0.009710</td>\n",
       "      <td>0.167165</td>\n",
       "      <td>Class_2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>25937</td>\n",
       "      <td>0.113138</td>\n",
       "      <td>0.101136</td>\n",
       "      <td>0.022262</td>\n",
       "      <td>0.490474</td>\n",
       "      <td>0.083402</td>\n",
       "      <td>0.317217</td>\n",
       "      <td>0.020204</td>\n",
       "      <td>0.241452</td>\n",
       "      <td>0.033521</td>\n",
       "      <td>...</td>\n",
       "      <td>0.032577</td>\n",
       "      <td>-0.044332</td>\n",
       "      <td>-0.095617</td>\n",
       "      <td>-0.126468</td>\n",
       "      <td>-0.115677</td>\n",
       "      <td>-0.037080</td>\n",
       "      <td>0.082948</td>\n",
       "      <td>0.002218</td>\n",
       "      <td>0.052820</td>\n",
       "      <td>Class_3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>45311</td>\n",
       "      <td>0.587813</td>\n",
       "      <td>-0.492328</td>\n",
       "      <td>-0.099530</td>\n",
       "      <td>0.238831</td>\n",
       "      <td>0.166449</td>\n",
       "      <td>-0.072206</td>\n",
       "      <td>-0.066623</td>\n",
       "      <td>-0.124674</td>\n",
       "      <td>0.113983</td>\n",
       "      <td>...</td>\n",
       "      <td>0.049908</td>\n",
       "      <td>-0.036065</td>\n",
       "      <td>-0.117997</td>\n",
       "      <td>-0.014939</td>\n",
       "      <td>0.065577</td>\n",
       "      <td>0.130757</td>\n",
       "      <td>0.119902</td>\n",
       "      <td>0.090398</td>\n",
       "      <td>0.057985</td>\n",
       "      <td>Class_6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 52 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      id     pca_0     pca_1     pca_2     pca_3     pca_4     pca_5  \\\n",
       "0  60774  0.098996  0.180128 -0.068552 -0.151364 -0.052621 -0.171841   \n",
       "1  58063  0.051463  0.085689 -0.035412 -0.177042 -0.086306 -0.057775   \n",
       "2   8088 -0.050043  0.491164  0.020679  0.463688  0.124514  0.247532   \n",
       "3  25937  0.113138  0.101136  0.022262  0.490474  0.083402  0.317217   \n",
       "4  45311  0.587813 -0.492328 -0.099530  0.238831  0.166449 -0.072206   \n",
       "\n",
       "      pca_6     pca_7     pca_8  ...    pca_41    pca_42    pca_43    pca_44  \\\n",
       "0 -0.052340 -0.004493 -0.124143  ...  0.330419  0.142985 -0.090690  0.039132   \n",
       "1 -0.145977  0.062874 -0.020713  ...  0.017201  0.033115  0.124033 -0.082698   \n",
       "2 -0.088920 -0.123791 -0.075584  ... -0.066895 -0.005588 -0.084927 -0.120549   \n",
       "3  0.020204  0.241452  0.033521  ...  0.032577 -0.044332 -0.095617 -0.126468   \n",
       "4 -0.066623 -0.124674  0.113983  ...  0.049908 -0.036065 -0.117997 -0.014939   \n",
       "\n",
       "     pca_45    pca_46    pca_47    pca_48    pca_49   target  \n",
       "0 -0.218198  0.355507  0.019291 -0.111644  0.076873  Class_9  \n",
       "1 -0.016145 -0.001503  0.042375  0.148873 -0.189273  Class_9  \n",
       "2  0.104887 -0.052950  0.065886 -0.009710  0.167165  Class_2  \n",
       "3 -0.115677 -0.037080  0.082948  0.002218  0.052820  Class_3  \n",
       "4  0.065577  0.130757  0.119902  0.090398  0.057985  Class_6  \n",
       "\n",
       "[5 rows x 52 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 读取数据\n",
    "# path to where the data lies\n",
    "dpath = './data/'\n",
    "\n",
    "train = pd.read_csv(dpath +\"Otto_FE_train_PCA.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 10000 entries, 0 to 9999\n",
      "Data columns (total 52 columns):\n",
      "id        10000 non-null int64\n",
      "pca_0     10000 non-null float64\n",
      "pca_1     10000 non-null float64\n",
      "pca_2     10000 non-null float64\n",
      "pca_3     10000 non-null float64\n",
      "pca_4     10000 non-null float64\n",
      "pca_5     10000 non-null float64\n",
      "pca_6     10000 non-null float64\n",
      "pca_7     10000 non-null float64\n",
      "pca_8     10000 non-null float64\n",
      "pca_9     10000 non-null float64\n",
      "pca_10    10000 non-null float64\n",
      "pca_11    10000 non-null float64\n",
      "pca_12    10000 non-null float64\n",
      "pca_13    10000 non-null float64\n",
      "pca_14    10000 non-null float64\n",
      "pca_15    10000 non-null float64\n",
      "pca_16    10000 non-null float64\n",
      "pca_17    10000 non-null float64\n",
      "pca_18    10000 non-null float64\n",
      "pca_19    10000 non-null float64\n",
      "pca_20    10000 non-null float64\n",
      "pca_21    10000 non-null float64\n",
      "pca_22    10000 non-null float64\n",
      "pca_23    10000 non-null float64\n",
      "pca_24    10000 non-null float64\n",
      "pca_25    10000 non-null float64\n",
      "pca_26    10000 non-null float64\n",
      "pca_27    10000 non-null float64\n",
      "pca_28    10000 non-null float64\n",
      "pca_29    10000 non-null float64\n",
      "pca_30    10000 non-null float64\n",
      "pca_31    10000 non-null float64\n",
      "pca_32    10000 non-null float64\n",
      "pca_33    10000 non-null float64\n",
      "pca_34    10000 non-null float64\n",
      "pca_35    10000 non-null float64\n",
      "pca_36    10000 non-null float64\n",
      "pca_37    10000 non-null float64\n",
      "pca_38    10000 non-null float64\n",
      "pca_39    10000 non-null float64\n",
      "pca_40    10000 non-null float64\n",
      "pca_41    10000 non-null float64\n",
      "pca_42    10000 non-null float64\n",
      "pca_43    10000 non-null float64\n",
      "pca_44    10000 non-null float64\n",
      "pca_45    10000 non-null float64\n",
      "pca_46    10000 non-null float64\n",
      "pca_47    10000 non-null float64\n",
      "pca_48    10000 non-null float64\n",
      "pca_49    10000 non-null float64\n",
      "target    10000 non-null object\n",
      "dtypes: float64(50), int64(1), object(1)\n",
      "memory usage: 4.0+ MB\n"
     ]
    }
   ],
   "source": [
    "train.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 准备数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# 将类别字符串变成数字\n",
    "# drop ids and get labels\n",
    "y_train = train['target']   #形式为Class_x\n",
    "X_train = train.drop([\"id\", \"target\"], axis=1)\n",
    "\n",
    "#保存特征名字以备后用（可视化）\n",
    "feat_names = X_train.columns \n",
    "\n",
    "#sklearn的学习器大多之一稀疏数据输入，模型训练会快很多\n",
    "from scipy.sparse import csr_matrix\n",
    "X_train = csr_matrix(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# 训练样本6w+，交叉验证太慢，用train_test_split估计模型性能\n",
    "# SVM对大样本数据集支持不太好\n",
    "from sklearn.model_selection import train_test_split\n",
    "X_train_part, X_val, y_train_part, y_val = \\\n",
    "    train_test_split(X_train, y_train, train_size = 0.8,random_state = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(8000, 50)\n"
     ]
    }
   ],
   "source": [
    "print (X_train_part.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模型训练"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### RBF核SVM正则参数调优\n",
    "\n",
    "RBF核是SVM最常用的核函数。\n",
    "RBF核SVM 的需要调整正则超参数包括C（正则系数，一般在log域（取log后的值）均匀设置候选参数）和核函数的宽度gamma\n",
    "C越小，决策边界越平滑； \n",
    "gamma越小，决策边界越平滑。\n",
    "\n",
    "采用交叉验证，网格搜索步骤与Logistic回归正则参数处理类似，在此略。\n",
    "\n",
    "这里我们用校验集（X_val、y_val）来估计模型性能"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.svm import SVC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "def fit_grid_point_RBF(C, gamma, X_train, y_train, X_val, y_val):\n",
    "    \n",
    "    # 在训练集是那个利用SVC训练\n",
    "    SVC3 =  SVC( C = C, kernel='rbf', gamma = gamma)\n",
    "    SVC3 = SVC3.fit(X_train, y_train)\n",
    "    \n",
    "    # 在校验集上返回accuracy\n",
    "    accuracy = SVC3.score(X_val, y_val)\n",
    "    \n",
    "    print(\"C= {} and gamma = {}: accuracy= {} \" .format(C, gamma, accuracy))\n",
    "    return accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "accuracy_s = np.matrix(np.zeros(shape=(5, 3)), float)\n",
    "gamma_s = np.logspace(-1, 1, 3)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C= 0.1 and gamma = 0.1: accuracy= 0.702 \n",
      "C= 0.1 and gamma = 1.0: accuracy= 0.726 \n",
      "C= 0.1 and gamma = 10.0: accuracy= 0.475 \n"
     ]
    }
   ],
   "source": [
    "oneC = 0.1\n",
    "\n",
    "for j, gamma in enumerate(gamma_s):\n",
    "    accuracy_s[0,j] = fit_grid_point_RBF(oneC, gamma, X_train_part, y_train_part, X_val, y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C= 1 and gamma = 0.1: accuracy= 0.7275 \n",
      "C= 1 and gamma = 1.0: accuracy= 0.759 \n",
      "C= 1 and gamma = 10.0: accuracy= 0.69 \n"
     ]
    }
   ],
   "source": [
    "oneC = 1\n",
    "\n",
    "for j, gamma in enumerate(gamma_s):\n",
    "    accuracy_s[1,j] = fit_grid_point_RBF(oneC, gamma, X_train_part, y_train_part, X_val, y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C= 10 and gamma = 0.1: accuracy= 0.754 \n",
      "C= 10 and gamma = 1.0: accuracy= 0.76 \n",
      "C= 10 and gamma = 10.0: accuracy= 0.6905 \n"
     ]
    }
   ],
   "source": [
    "oneC = 10\n",
    "\n",
    "for j, gamma in enumerate(gamma_s):\n",
    "    accuracy_s[2,j] = fit_grid_point_RBF(oneC, gamma, X_train_part, y_train_part, X_val, y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C= 100 and gamma = 0.1: accuracy= 0.7675 \n",
      "C= 100 and gamma = 1.0: accuracy= 0.739 \n",
      "C= 100 and gamma = 10.0: accuracy= 0.6875 \n"
     ]
    }
   ],
   "source": [
    "oneC = 100\n",
    "\n",
    "for j, gamma in enumerate(gamma_s):\n",
    "    accuracy_s[3,j] = fit_grid_point_RBF(oneC, gamma, X_train_part, y_train_part, X_val, y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C= 1000 and gamma = 0.1: accuracy= 0.75 \n",
      "C= 1000 and gamma = 1.0: accuracy= 0.731 \n",
      "C= 1000 and gamma = 10.0: accuracy= 0.688 \n"
     ]
    }
   ],
   "source": [
    "oneC = 1000\n",
    "\n",
    "for j, gamma in enumerate(gamma_s):\n",
    "    accuracy_s[4,j] = fit_grid_point_RBF(oneC, gamma, X_train_part, y_train_part, X_val, y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "#需要调优的参数\n",
    "#C_s = np.logspace(-1, 3, 5)# logspace(a,b,N)把10的a次方到10的b次方区间分成N份 \n",
    "#gamma_s = np.logspace(-1, 1, 3)    \n",
    "\n",
    "#accuracy_s = []\n",
    "#for i, oneC in enumerate(C_s):\n",
    "#    for j, gamma in enumerate(gamma_s):\n",
    "#        tmp = fit_grid_point_RBF(oneC, gamma, X_train_part, y_train_part, X_val, y_val)\n",
    "#        accuracy_s.append(tmp)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从上述结果会发现，gamma参数非常重要(当gamma=0.01或gamma=100时性能很差),非线性模型比线性模型性能好（注意我们这里只用了tfidf特征）。\n",
    "但速度慢了不是一点半点(sklearn建议核方法SVM样本数不超过10000)\n",
    "可以考虑将训练样本分为多个子集，每个子集训练一个RBF核SVM模型，最后多个模型融合的结果的到最终模型（训练速度加快，但测试可能更慢）\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3de3wU5b348c93cyF3QiAmgYgRQQIiIERAtBSpWJSbh0MPXqtW8fjzqEfrsYo9tT1RFFS8tFItByxoK9p6pRYvqFRbipCIQSDchAMmJAEMkARy22Sf3x872WySTdiE3UzCft+v1752Z+aZ2W8myfOdeZ6ZZ8QYg1JKqdDlsDsApZRS9tJEoJRSIU4TgVJKhThNBEopFeI0ESilVIgLtzuA9urTp4/JyMiwOwyllOpWvvzyy++MMcm+lnW7RJCRkUFubq7dYSilVLciIvtbW6ZNQ0opFeI0ESilVIjTRKCUUiFOE4FSSoU4TQRKKRXiNBEopVSI00SglFIhrtvdR6CUT8aAqw5c9dZ788+tzfMxbfwo42u7ALF9IL4vJKS532OTwaHHW6pr00SgWqqrhcM74MgeqHf6UQm6OlZxek+b9m7Dx/pdkSMc4lIgPq0xOSSkuafj0yChr/u9R5zdkaoQpokg1FUdg4NbofhrKNnifh3eAS5n+7fliHBXfI5wcIQ1++w93Xy59YqIbllGmm/nJNtoMc/hR5lWptv73cYFJw5BeTFUFFnv1qu8CA7vhD1/g9qKlvuuRwLEpzZNDgl9rXlW8og9A8L8/5d1uQxOlwtnvaGu3kVtvYu6eoOz3j3PaU2751vzXC6cdS7qXO7ltV6fG9dxUWtts/m2nPUunC5jbaPtcvXGkBAVQVJsZItXr5hIesdZ77GR9IyOwOGQ9v9NKr9oIggVxkBZQWNlX7IFSr6GY982lolNhtThMPAHkHo+JA+GiBiQ5pWprwo4dJs/6updlJRVc7C8htq6Hjjr++MkHWeswRnloq63C2ddYyUrtceJqDpIVNUhoqoOElNziJiaw8TVHCLu4GESCvKJrysljKZnOS4cHHX04ogjicPSm8MkcYheHCSJElcvil2JFLmSOFLfgzoX1LuC+/TBiDAh3OEgIkyICHMQEeYgPEyItN7d0w4irXLRkQ4iHFbZcAcOgfIqJ0dO1LL3u+McOV7Lidp6n9/lEOgVE0mvhmQRE0lSnPu9V6w7WTR/j4oIC+rPfzrRRHA6qquF73a2rPSry6wCAr0HQr8sGH2zu/JPPR/iU2wNu6uqrK2j6FgVhUerKDpWzYFjlRzwfK6ipLy6g5VuT0R6EuHIdFem4Q7CHe6KMzLC0MdRQaocJUWOkGyOkMwRertK6e0q5az6EkbWbyXGdbzpJgVqI6M5HpnMiR7JVPY4g6qoM6iOTqEmOgVnzBnURqdSH5dCeHhks8q77Uo9wmt5uEMQCfwRerWznqOVtRw50fR19EQtpSdqOVpZS+lxd+LI3V/L0Upnq/s+OiLM59mGnnW0pImgu6sug5KtXhX+Zjjk1bQTHgUp58F5s9yVfepwSBkKkbH2xt1FGGM4WunkwNEqdwV/rNrrs7uyP3Kitsk6YQ4hNSGKfonRjDk7iX6J0fTrFU1qQhRREWGNleXJKliHEHaqFWptpVfzk/s9sqKYpPIikiqKoXwbfPexj6Y+cZ8B+uqvaJgXmwbRvSAIFX5roiLCSOsZTVrPaL/Ku1yG8mpni8RxpLKWI8etdyuR7Dl8nKMnOnbW0VoyOV3OOqS7Pbw+KyvLhOToo8ZAWWHTI/ySLXDMa0DBmD6QNryxwk89H5LOaVe78ummrt7FwYqaxsr9aJW7sj9WxYGjlRQdq6bK2bRiiI4Io1+vaPolRtM3MZp063O/Xu7plPgehId1o6Ywlwuqjrj7KRr6KypKWvZjVJa2XDc8qll/ha8+jDQI79H5P1cHNZx1lB6v9Xn20eRMxFre2glfTGQYvdpIFF3prENEvjTGZPlcpomgC6p3ujsWm1f61cesAgK9z7EqfK9KPy6lU4/euoKq2voWR/JFns++m22SYiPdFbtX5d7PqvD7JkbTKyYiKM0eXZ6zGo6XNHZ2V5R4JQ+veXXVLdeN6e11RVRq00toG84wYnp3y7/PhrOOUu8mKq93zxmIVwKp7IJnHW0lgtA9VOwqqsvg4LamFf6h7VBvNUeER8EZQ2HoTOtof7h7OgQuN2zabGO9vCv7Y1V+Ndv0tSr8hso/OvL0OJ0PuIgo6JXhfrXGGKg62qQpqvEsw3ovyoMTh4FmB5lhkY1JIj7V/QqLdCcHcQDWuzh8zKNxmWdea+vJSbbltQxOsi0HDoREcZDYEEO0A6IFkpuvFwHSA0SoqTeUV9dRVl1PWVUd5TX1HKtyfz5WXUVZVQVHquooK6lnW5WTY1V11CMYIxgEF4ILBwaIigwnITqSxNge/HjCefxwZBu/nw7SRNBZjIHyAy2P8o/uaywT09td0Y+9vfEov/fA07Zpp3mzTdGxagqtSr/IqvTbarYZ1q+np9mmobLvds023Y0IxCS5XynntV6u3mk1PxU3PaNoSB4Ht8I3n3jdA2Lc78Y0TndjPYBk6+X3Cm2pcb92Fv4PjLznVELz6fSsYexW74TvdjdW9g3vVUcbyySdA2kj4YIbvK7aSe2Wp86tcTfb+DiS96PZZmByHBMGJXsq/ZBvtuluwiIg8Uz3q6NMs+TQWsLwTPua19p6NC476Xq+tm1PDIMzLu74/myDJoJTVV3eStNOjXt5WA/3VTpDpjdW+CnnQY94e+M+RQ3NNg2XVTZU9kXHGptxtNlGnRIR9019Kug0EfjLGHf7Z4umnf9rLBOd5G7HH3ubV9POoG7ZtONyGUrKq61r590VuzbbKHV66n41VGeor4PS3e6KvnhzY+VfdaSxTNIAd6V/wXVeTTtp3bJpp6zKyc6SCnaWlLO9pIIdxeXsOnic4zV1Tcpps41SpydNBDUVLZt2DuY3bdo5YwhkTm3atBOVYG/cHVBX7+L/vjvhqex3lFSws6SCA8eqPGUSosLJTEvgX0f1Y1BKPGcmxWizjVKnudBJBMa4r1bwrvBLtsCRvY1lonu5K/sxcxsr/T6D3B1f3YgxhsPHa9hRXMGOEneFv6O4gm8OHae23t1JFe4QzkmOIyujF9el9mdIagKZafGkJkTpUb1SISZ0EsHfn4JPH22c7pXhruhHXNt4Y1ZC327XtFPtrGfXwQpPZd9Q8Xt31KYk9GBwagLfG9SHzLR4MlMTGJAcS49wPcJXSoVSIhh4mXuoX0/TTk+7I2oXl8tw4FgV260mnYYKf993Jzy3v0dFOBicEs/kISlkpsUzONVd6SfFRtobvFKqSwudRND3AverG2jovG1s1ilnZ0lFk8GyzuodQ2ZqPNOH9yUzNZ7MtAT6J8UQFqKjJyqlOi50EkEX5LQ6b3d4dd7uKC6nqKxxLJee0RFkpsYze3Q6mWkJZKbGc25KPLE99FenlAoMrU06gTGGwxU1jU06xe42/eadtwPPiOPCs5PItDpuM1O181YpFXyaCAKsqrae3YfcHbfbS8qtJp6WnbeZqQl879w+7mad1ATOSY4jMlxvtlJKdT5NBB3kchkKj1Z5VfbuI/3/Kz1Bw8je0RFhnJsaz+VDUzwdt5mp8fTSzlulVBeiicAPZZVOdpSUs/NgBdutSzR3eXXeisBZSTEMTo1n+oi+DEmLZ3Cqdt4qpboHTQReGjpvt3vdddta5+2Pss60jvK181Yp1b2FZO3V0Hm73evSzO0lFezx0Xk75uwkBludt0NSE0hJ6KGdt0qp00pQE4GITAGeA8KApcaYBc2WPwNcak3GAGcYYxKDEUvOviOs3lLMjuIKdh5s2nmbmhDF4NR4JpzbhyGpCQxOjdfOW6VUyAhaIhCRMGAxMBkoBHJEZJUxJr+hjDHmXq/ydwFBu+Nr24EyXttY4Om8zUx1t+Nr561SKtQF84xgDPCNMWYvgIi8BswE8lspfw3wy2AFc83Y/vz4ogwc2nmrlFJNBLPtox9Q4DVdaM1rQUTOAs4GPm1l+W0ikisiuYcPH+5QMD3CwzQJKKWUD8FMBL5q3daeSH018IYxpt7XQmPMEmNMljEmKznZ78dBK6WU8kMwE0Eh4P3k6nSgqJWyVwMrgxiLUkqpVgQzEeQAg0TkbBGJxF3Zr2peSEQGA72A9UGMRSmlVCuClgiMMXXAncCHwHbgT8aYbSKSLSIzvIpeA7xmjGmt2UgppVQQBfU+AmPMamB1s3kPN5v+VTBjUEop1Ta9Y0oppUKcJgKllApxmgiUUirEaSJQSqkQp4lAKaVCnCYCpZQKcZoIlFIqxGkiUEqpEKeJQCmlQpwmAqWUCnGaCJRSKsRpIlBKqRCniUAppUKcJgKllApxmgiUUirEaSJQSqkQp4lAKaVCnCYCpZQKcZoIlFIqxGkiUEqpEKeJQCmlQpwmAqWUCnGaCJRSKsRpIlBKqRCniUAppUKcJgKllApxmgiUUirEaSJQSqkQp4lAKaVCnCYCpZQKcZoIlFIqxGkiUEqpEKeJQCmlQpwmAqWUCnGaCJRSKsSF2x2AUqopYwwu48JlXNSbelzGRZ2pw+VqnPa8u+pbzmtYx1Xnc35b6/hcZs3zNb9hus15pr5F7A2fVftck3kNE9InBHy7mgiUshhjqK6v5oTzBJXOSirrKj2fT9RZ85zWPK9lDZ+r6qraV2k2qyDrTJ0nAXQHDnEQJmGESZjns8PhaDLt+exoOc8hDgSx+8foVmrqa4KyXU0Eqtuqd9VTVVfFCecJTtSdoMpZ1aSSbqicvSvzE06rnFWxe1fmlXWVflfCEY4IYiNiiQmPISYihtiIWKLDowlzhHkqu45WkN7zvJf5nO9jnkMchDvCPZVta8uaz2+IvcUyh4NwabqOQxyIaCV+ughqIhCRKcBzQBiw1BizwEeZfwN+BRhgszHm2mDGpOzjrHc2qaQ9R9JWxeypsL0q8IZy3kfeDcur6qr8/u7o8OgmlXZMeAy9onqRHp9OTLg1LyKmyefYcGue1+eGdSPCIoK4p5TqXEFLBCISBiwGJgOFQI6IrDLG5HuVGQTMAy42xhwVkTOCFY9qv5r6Go7XHm/SNOJdMbfWRNKiKcX67HQ5/fpehzg8lbZ3xZwak+q7YvYu56NSbzhSV0r5FswzgjHAN8aYvQAi8howE8j3KjMXWGyMOQpgjDkUxHhUO3y470Me/PuD1LnqTlo23BFObERskyPomPAYkqOTPRWxdyUdGxFLdER0k8o8NtyaFxFLVFiUNjso1YmCmQj6AQVe04XA2GZlzgUQkXW4m49+ZYz5oPmGROQ24DaA/v37ByVY1aispozHNjzGwMSBzBo0q82mk9iIWG0mUaqbC2Yi8HVIZ3x8/yBgIpAO/F1EhhljjjVZyZglwBKArKys5ttQAfbspmcpqyljyeQlDE4abHc4SqkgC+YNZYXAmV7T6UCRjzLvGmOcxpj/A3biTgzKJnmH8nhj1xtcN+Q6TQJKhYhgJoIcYJCInC0ikcDVwKpmZd4BLgUQkT64m4r2BjEm1Qany0n2F9mkxKRwx8g77A5HKdVJ/EoEIvKmiEwVEb8ThzGmDrgT+BDYDvzJGLNNRLJFZIZV7EOgVETygbXA/caY0vb9CCpQXt3+KruP7mbemHnERsTaHY5SqpOIMSdvcheRy4CbgXHAn4HlxpgdQY7Np6ysLJObm2vHV5/Wio8XM/PdmYxJHcNvJv1Gr9pR6jQjIl8aY7J8LfPrCN8Y87Ex5jpgFLAPWCMi/xSRm0VELxk5DTy+8XEAHhr7kCYBpUKM3009ItIbuAm4FfgK9x3Do4A1QYlMdZpPv/2UtQVruX3E7fSN62t3OEqpTubX5aMi8haQCbwCTDfGFFuLXhcRbafpxiqdlTy+8XEGJg7khqE32B2OUsoG/t5H8Lwx5lNfC1prc1Ldw4ubX6TkRAkrpqwgwqGtfEqFIn+bhoaISGLDhIj0EhG9vrCb23V0Fy/nv8ysQbMYlTLK7nCUUjbxNxHM9b7b1xobaG5wQlKdwWVcPLL+EeIj47l31L12h6OUspG/icAhXpeSWCOLRgYnJNUZ3t79NnmH87gv6z4SoxJPvoJS6rTlbx/Bh8CfRORF3OMF3Q60GBxOdQ9Hqo/w9JdPMzplNDPPmWl3OEopm/mbCB4A/h34f7gHk/sIWBqsoFRwLcpdRGVdJQ+Pe1jvGVBK+ZcIjDEu4AXrpbqxnJIcVu1Zxdzz5zIgcYDd4SilugB/7yMYBDwODAWiGuYbY7Qm6UZq62vJXp9Nv7h+zB2uff1KKTd/O4t/j/tsoA73aKEv4765THUjy7ctZ1/5Ph4a+xDR4dF2h6OU6iL8TQTRxphPcA9St98Y8ytgUvDCUoFWUF7Akq+XMPmsyUxIn2B3OEqpLsTfzuJqawjq3SJyJ3AA0AfNdxPGGOZvnE+YhPHAhQ/YHY5Sqovx94zgHiAGuBsYDVwP3BisoFRgfbT/I9YdWMddF9xFSmyK3eEopbqYk54RWDeP/Zsx5n7gOO7nEqhu4njtcRZuXMiQpCFcnXm13eEopbqgkyYCY0y9iIwWETH+PMVGdSnP5z3Pd1Xf8dylzxHu8LclUCkVSvytGb4C3hWRPwMnGmYaY94KSlQqILaVbmPljpXMGTyH85PPtzscpVQX5W8iSAJKaXqlkAE0EXRR9a56stdnkxSVxN2j7rY7HKVUF+bvncXaL9DNvL7zdfJL83liwhPER8bbHY5Sqgvz987i3+M+A2jCGPOTgEekTtmhykP8+qtfc1HaRUzJmGJ3OEqpLs7fpqH3vD5HAf8CFAU+HBUIT+Y8ibPeyc/H/VwHlVNKnZS/TUNvek+LyErg46BEpE7JugPr+GDfB9wx8g7OSjjL7nCUUt2AvzeUNTcI6B/IQNSpq66rZv6G+WQkZHDLsFvsDkcp1U3420dQQdM+ghLczyhQXcjSLUspqChg6eVLiQzTB8gppfzjb9OQXnbSxe0t28uyrcuYNmAaY9PG2h2OUqob8atpSET+RUR6ek0nishVwQtLtYcxhke/eJTo8Gjuy7rP7nCUUt2Mv30EvzTGlDVMGGOOAb8MTkiqvd7b+x45JTncM+oe+kT3sTscpVQ3428i8FVOB67pAspqyngq9ymGJw9n9rmz7Q5HKdUN+VuZ54rI08Bi3J3GdwFfBi0q5bdnNz1LWU0ZSyYvwSEdvQhMKRXK/K057gJqgdeBPwFVwH8EKyjln7xDebyx6w2uG3Idg5MG2x2OUqqb8veqoRPAg0GORbWD0+Uk+4tsUmJSuGPkHXaHo5Tqxvy9amiNiCR6TfcSkQ+DF5Y6mVe3v8ruo7uZN2YesRGxdoejlOrG/G0a6mNdKQSAMeYo+sxi2xQfL2Zx3mK+n/59JvWfdPIVlFKqDf4mApeIeIaUEJEMfIxGqjrH4xsfB+ChsQ/poHJKqVPm71VDPwf+ISKfWdMTgNuCE5Jqy6fffsragrXcO/pe+sb1tTscpdRpwN/O4g9EJAt35Z8HvIv7yiHViSqdlTy+8XEGJg7khqE32B2OUuo04e+gc7cC/wmk404E44D1NH10pQqyFze/SMmJElZMWUGEI8LucJRSpwl/+wj+E7gQ2G+MuRS4ADh8spVEZIqI7BSRb0SkxeWnInKTiBwWkTzrdWu7og8hu47u4uX8l5k1aBajUkbZHY5S6jTibx9BtTGmWkQQkR7GmB0i0uYdTCIShvtO5MlAIZAjIquMMfnNir5ujLmz/aGHDpdx8cj6R4iPjOfeUffaHY5S6jTj7xlBoXUfwTvAGhF5l5M/qnIM8I0xZq8xphZ4DZjZ8VBD19u73ybvcB73Zd1HYlTiyVdQSql28Lez+F+sj78SkbVAT+CDk6zWDyjwmi4EfA2U/68iMgHYBdxrjCloXkBEbsO6Sql//9B6MNqR6iM8/eXTjE4ZzcxzNI8qpQKv3aOUGWM+M8asso7y2+LrAvfm9x78BcgwxgzH/QzkFa185xJjTJYxJis5Obm9IXdri3IXUVlXycPjHtZ7BpRSQRHM4SoLgTO9ptNp1pxkjCk1xtRYk/8LjA5iPN1OTkkOq/as4ubzbmZA4gC7w1FKnaaCmQhygEEicraIRAJXA6u8C4hImtfkDGB7EOPpVmrra8len02/uH7MHT7X7nCUUqexoD1cxhhTJyJ3Ah8CYcBLxphtIpIN5BpjVgF3i8gMoA44AtwUrHi6m+XblrOvfB+Lf7CY6PBou8NRSp3GgvqUMWPMamB1s3kPe32eB8wLZgzdUUF5AUu+XsLksyYzIX2C3eEopU5z+kirLsYYw/yN8wmTMB648AG7w1FKhQBNBF3MR/s/Yt2Bddx1wV2kxKbYHY5SKgRoIuhCjtceZ+HGhQxJGsLVmVfbHY5SKkQEtY9Atc/zec/zXdV3PHfpc4Q79FejlOocekbQRWwr3cbKHSuZM3gO5yefb3c4SqkQoomgC6h31ZO9PpukqCTuHnW33eEopUKMJoIu4PWdr5Nfms/PLvwZ8ZHxdoejlAoxmghsdqjyEL/+6tdclHYRUzKm2B2OUioEaSKw2ZM5T+Ksd/LzcT/XQeWUUrbQRGCjdQfW8cG+D7h1+K2clXCW3eEopUKUJgKbVNdVM3/DfDISMrhl2C12h6OUCmF6sbpNlm5ZSkFFAUsvX0pkWKTd4SilQpieEdhgb9lelm1dxrQB0xib5uuhbUop1Xk0EXQyYwyPfvEo0eHR3Jd1n93hKKWUJoLO9t7e98gpyeGeUffQJ7qP3eEopZQmgs5UVlPGU7lPMTx5OLPPnW13OEopBWhncad6dtOzlNWUsWTyEhyiOVgp1TVobdRJ8g7l8cauN7huyHUMThpsdzhKKeWhiaATOF1Osr/IJiUmhTtG3mF3OEop1YQ2DXWCV7e/yu6ju3l24rPERsTaHY5SSjWhZwRBVny8mMV5i/l++veZ1H+S3eEopVQLmgiC7PGNjwPw0NiHdFA5pVSXpIkgiD799lPWFqzl9hG30zeur93hKKWUT5oIgqTSWcnjGx9nYOJAbhh6g93hKKVUq7SzOEhe3PwiJSdKWDFlBRGOCLvDUUqpVukZQRDsOrqLl/NfZtagWYxKGWV3OEop1SZNBAHmMi4eWf8I8ZHx3DvqXrvDUUqpk9KmoQB7e/fb5B3O45GLHyExKtHucFQX5nQ6KSwspLq62u5Q1GkkKiqK9PR0IiL8b5LWRBBAR6qP8PSXTzM6ZTQzz5lpdziqiyssLCQ+Pp6MjAy9tFgFhDGG0tJSCgsLOfvss/1eT5uGAmhR7iIq6yp5eNzD+o+tTqq6uprevXvr34oKGBGhd+/e7T7L1EQQIDklOazas4qbz7uZAYkD7A5HdROaBFSgdeRvShNBANTW15K9Ppt+cf2YO3yu3eEopVS7aCIIgOXblrOvfB8PjX2I6PBou8NRqt3Gjh3LyJEj6d+/P8nJyYwcOZKRI0eyb9++dm3nrbfeYseOHe3+/ksuuYS8vLx2r9fgqaee4tVXX+3w+p3hRz/6EXv37vW57NChQ0ycOJHY2FjuueeeVrdRWlrKD37wAwYNGsQPf/hDysrKAhKbJoJTVFBewJKvlzD5rMlMSJ9gdzhKdciGDRvIy8sjOzubOXPmkJeXR15eHhkZGe3aTkcTwalwOp288sorzJkzp1O/t71uv/12nnzySZ/LYmJimD9/PgsXLmxzG/Pnz+eKK65g9+7dfO973+OJJ54ISGx61dApMMYwf+N8wiSMBy58wO5wVDf2P3/ZRn5ReUC3ObRvAr+cft4pb+f9998nOzubmpoaBg0axEsvvURsbCz3338/f/3rXwkPD+eKK65g2rRprF69mnXr1vGrX/2Kd955p92JBOAPf/gDCxcuxBjDjBkzeOyxxwD43e9+x6JFi+jbty8DBw4kLi6OZ599ljVr1nDhhRcSFhYGwBdffMFtt91GXFwcF198MWvWrCEvL489e/Zw0003cfz4cRwOB7/97W8ZO3YsH3/8MfPnz6d3795s3ryZOXPmcO655/Kb3/yGmpoaVq1aRUZGBtdffz09e/YkPz+fb7/9lt///vcsW7aMDRs2cPHFF7Ns2TIAbrvtNjZt2kRVVRVz5szh4YcfBmDixInceuut1NfXe2Jt0BDr9u3b29w37777Ll988QUAN954I1OmTGH+/Pnt3sfNaSI4BR/t/4h1B9bxwIUPkBKbYnc4SgXcoUOHWLBgAZ988onnqPW5557jlltuYfXq1Wzbtg0R4dixYyQmJnLllVcye/Zsrrrqqg59X2FhIf/93/9Nbm4uPXv25LLLLuO9995jxIgRLFiwgE2bNhEbG8vEiRMZM2YMAOvWrWP06NGebdx8882sWLGCMWPG8F//9V+e+WlpaaxZs4aoqCh27NjBjTfeyIYNGwDYvHkz27dvp2fPnmRkZHDHHXeQk5PDokWLeP7553nqqacAKCsrY+3atbz55ptMnz6d9evXk5mZyahRo9i6dSvDhg1jwYIFJCUlUVdXx6WXXsrs2bMZOnQoYWFhZGRksHXrVkaMGNGh/VNaWkpycjIA/fr1o7i4uEPbaU4TQQcdrz3Owo0LGZI0hKszr7Y7HNXNBeLIPRj++c9/kp+fz/jx4wGora3lkksuISkpCYfDwdy5c5k6dSrTpk0LyPdt2LCBSZMm0adPHwCuvfZaPv/8c6qrq5k0aRK9evUCYPbs2Xz77bcAFBcXc8EFFwDw3XffUVtb60kS1157LR9//DEANTU13HnnnWzevJnw8HD27Nnj+d6xY8eSkuI+mBswYAA//OEPATj//PNZv369p9z06dM98/v27cvQoUMBGDp0KPv27WPYsGGsXLmSZcuWUVdXR1FREfn5+Z5yZ5xxBkVFRR1OBM0F6qoz7SPooOfznue7qu94+KKHCXdoPlWnJ2MMU6ZM8fQZ5Ofns2TJEiIiIsjNzeWqq67izTffZOrUqW1up7a21tMBnZ2d3eb3tWc+QHR0tOe6+bbKLVq0iDPPPJMtW7awceNGampqPMt69Ojh+exwODzTDoeDurq6FuW8y3iX2/5V4DMAABKTSURBVL17N8899xyffvopX3/9NVOmTGlyTX91dTXR0dG88cYbnv3Rnk7y3r17c/jwYQAOHDhAamqq3+u2JaiJQESmiMhOEflGRB5so9xsETEikhXMeAJlW+k2Vu5YyZzBcxjWZ5jd4SgVNOPHj+ezzz7zXO1y4sQJdu/eTUVFBeXl5UybNo1nnnmGr776CoD4+HgqKipabCcyMtKTTBrazH0ZN24ca9eupbS0lLq6Ol577TW+//3vM3bsWNauXcuxY8dwOp289dZbnnWGDBnCN998A0BycrInSQG89tprnnJlZWWkpaUhIqxYsaLNpNFR5eXlxMfHk5CQQHFxMR9++GGT5bt37+a8885j9uzZnv0xcuRIv7c/Y8YMVqxYAcCKFSuYOTMwIxgELRGISBiwGLgCGApcIyJDfZSLB+4GNgQrlkCqd9WTvT6bpKgk7h51t93hKBVUKSkpLFu2jDlz5jBixAjGjx/Prl27KCsrY+rUqYwYMYJJkybx9NNPA3DNNdfw2GOPdejSU4D09HSys7OZOHEiI0eOZNy4cUydOpX+/ftz//33M2bMGC6//HLOO+88evbsCcCVV17JZ5995tnGSy+9xM0338z48eNxOByecnfeeSdLly5l3Lhx7N+/v8kRfaCMGjWKoUOHMmzYMObOncvFF1/sWVZUVETPnj09bfy+fvaf/exnLFu2jPT0dHbu3Am4+zwazhoeeugh/vrXvzJo0CA+//xz7r///sAEbowJygu4CPjQa3oeMM9HuWeBacDfgKyTbXf06NHGTn/M/6MZtnyYWb13ta1xqO4vPz/f7hC6lYqKCmOMMbW1teaKK64wq1at8iybPn262bNnT5Nyxhjz6KOPmp/+9KedG2grnnjiCbN8+fJO+S5ff1tArmmlXg1m01A/oMBrutCa5yEiFwBnGmPea2tDInKbiOSKSG5D+5gdDlUe4tdf/ZqL0i5iSsYU2+JQKhT94he/4IILLmD48OEMHjy4SQf1woULKSoqAmDVqlWMHDmSYcOGsX79eubNm2dXyE307t2b66+/3u4wfApmL6ev7mxPo5yIOIBngJtOtiFjzBJgCUBWVlbgG/b89GTOkzjrnfx83M91jBilOtkzzzzT6rIhQ4Z4Pl977bVce+21nRFSu/zkJz+xO4RWBfOMoBA402s6HSjymo4HhgF/E5F9wDhgVVftMF53YB0f7PuAW4ffylkJZ9kdjlJKBUwwE0EOMEhEzhaRSOBqYFXDQmNMmTGmjzEmwxiTAXwBzDDG5AYxpg6prqtm/ob5ZCRkcMuwW+wORymlAipoTUPGmDoRuRP4EAgDXjLGbBORbNydFqva3kLXsXTLUgoqClh6+VIiwyLtDkcppQIqqHdCGWNWA6ubzfN5EbExZmIwY+movWV7WbZ1GdMGTGNs2li7w1FKqYDTO4vbYIzh0S8eJTo8mvuy7rM7HKWCRoehDr7mw1Dn5OQwbNgwBg4cyL333utzHWMMd9xxBwMHDmTEiBGntI/aoomgDe/tfY+ckhzuGXUPfaL72B2OUkGjw1AHX/NhqG+//XZ+//vfs3v3brZt28aaNWtarPOXv/yFgoICvvnmGxYvXsx//Md/BCU2HSSnFWU1ZTyV+xTDk4cz+9zZdoejTnfvPwglWwK7zdTz4YoFp7wZHYY68MNQFxUVUV1dzYUXXgjADTfcwDvvvMPkyZOb7It3332XH//4x4D7rKmkpITDhw+3endyR2kiaMWzm56lrKaMJZOX4BA9cVKhSYehDs4w1FVVVZx5ZuPV9enp6Rw4cKDF/jhw4IDPcpoIOkHeoTze2PUGPx76YwYnDbY7HBUKAnDkHgw6DHVwhqFOTExs8bP7uknV+BgYLxg3s+qhbjNOl5PsL7JJiUnhjpF32B2OUrYyOgx1UIahTk9Pp6CgcQSewsJC+vbt2yJmf8udKk0Ezby6/VV2H93NvDHziI2ItTscpWylw1C3j7/DUJ955pn06NGDnJwcjDG88sorPoeUnjFjBi+//DIA//jHP0hJSQl4sxBo01ATxceLWZy3mInpE5nUf5Ld4ShlO+9hqGtrawF47LHHiI6OZtasWdTU1OByuZoMQ/3v//7vLFq0qEOdxd7DUBtjmD59uudso2EY6n79+rUYhvqWWxrv+G8Yhjo+Pp4JEyY0GYZ69uzZrFy5kssuuyzow1APGDCgzWGoX3jhBW666Saqq6uZNm2ap6N48eLF9OjRg1tvvZXp06fz/vvvc8455xAbG+t5FkHAtTYsaVd9BXMY6rs+uctc+IcLzYGKA0H7DqUa6DDU7aPDUPuvKw1D3a18+u2nrC1Yy+0jbqdvXODb4JRSp0aHoQ4eMUFoJwumrKws09D+FyiVzkpmvjuTuIg4/jT9T0Q4IgK6faV82b59e5Phk5UKFF9/WyLypTHG5+jO2kcAvLj5RUpOlLBiygpNAkqpkBPyTUO7ju7i5fyXmTVoFqNSRtkdjlJKdbqQTgQu4+KR9Y8QHxnPvaN8D/qklFKnu5BOBG/vfpu8w3ncl3UfiVEt7/RTSqlQELKJ4Ej1EZ7+8mlGp4xm5jktb+RQKpToMNTB13wY6gcffJD09HSfw014e/TRRxk4cCCZmZme4TICLWQ7ixflLqKyrpKHxz2sD6JXIa9h8LXly5eTm5vL888/36HtvPXWWzgcDjIzMwMZXpsahqHetGlTp31nRzQMQ/3CCy8AMHPmTO68806GDRvW6jpff/01b731Fvn5+RQUFDBlyhR27tyJwxHYY/iQTAQ5JTms2rOKuefPZUDiALvDUYqFGxey40hgx/HPTMrkgTEPnPJ2dBjqwA9DHRYWxkUXXdRkHCNf3n33Xa655hoiIyM555xz6N+/P19++aVn+OpACbmmodr6WrLXZ9Mvrh9zh8+1OxylujTvYag3bdrE8OHDee655zh48KBnGOqvv/6aefPm8b3vfY8rr7ySZ555pkMPtYHGYajXrl3LV199xbp163jvvfcoKChgwYIFbNiwgY8++oj8/HzPOr6GoV66dCn//Oc/m4wn1DAM9VdffcUf//hH7r77bs+yzZs3s3jxYrZs2cLSpUvZt28fOTk53HjjjU3OjhqGoX7iiSeYPn06DzzwAPn5+Xz55Zds3boVgAULFpCbm8vmzZtZs2aNJ1bvYaj91dow1IEWcmcEy7ctZ1/5Phb/YDHR4dF2h6MUQECO3INBh6EOzjDUI0aM8Gt/+LrhNxhN2SGVCArKC1jy9RImnzWZCekT7A5HqS7PWMNQv/LKKy2W5ebmsmbNGl577TVeeOEFPvroo1a34105z5o1q9URSFsb6aCtERDaOwz1H/7wB5xOJ3FxcZ5lgR6GeuPGjSQmJnL99df7HIbaXzoMdYAZY5i/cT5hEsYDF3bNoy+luhodhrp9/B2G2l8zZsxg5cqV1NbWsmfPHvbv39+kGSxQQiYRfLT/I9YdWMddF9xFSmyK3eEo1S14D0M9YsQIxo8fz65duygrK2Pq1KmMGDGCSZMmNRmG+rHHHuvQpafQdBjqkSNHMm7cOKZOnUr//v09w1BffvnlLYah/uyzzzzbaBiGevz48TgcjibDUC9dupRx48axf//+oA9DPXfu3DaHof7pT39KRkYG5eXlpKen8+ijjwLw9ttvex7eM2LECK666iqGDBnClVdeyW9/+9uAXzEEhM4w1H8v/Lu565O7jLPe2aH1lQo0HYa6fXQYav+1dxjqkOkjuKTfJVzS7xK7w1BKddAvfvEL/va3v1FdXc2UKVN8DkM9YMAAVq1axRNPPEFdXR0ZGRksX77cvqC96DDUARSMYaiVsoMOQ62Cpb3DUIdMH4FSXVF3OxBTXV9H/qY0EShlk6ioKEpLSzUZqIAxxlBaWkpUVFS71guZPgKlupr09HQKCws5fPiw3aGo00hUVBTp6entWkcTgVI2iYiI4Oyzz7Y7DKW0aUgppUKdJgKllApxmgiUUirEdbv7CETkMLC/g6v3Ab4LYDiBonG1j8bVfl01No2rfU4lrrOMMcm+FnS7RHAqRCS3tRsq7KRxtY/G1X5dNTaNq32CFZc2DSmlVIjTRKCUUiEu1BLBErsDaIXG1T4aV/t11dg0rvYJSlwh1UeglFKqpVA7I1BKKdWMJgKllApxp3UiEJEficg2EXGJSKuXXInIFBHZKSLfiMiDnRBXkoisEZHd1nuvVsrVi0ie9VoVxHja/PlFpIeIvG4t3yAiGcGKpZ1x3SQih7320a2dFNdLInJIRLa2slxE5NdW3F+LyKguEtdEESnz2l+tPzw4cDGdKSJrRWS79b/4nz7KdPr+8jOuTt9f1vdGichGEdlsxfY/PsoE9n+ytUeXnQ4vYAgwGPgbkNVKmTBgDzAAiAQ2A0ODHNcTwIPW5weBha2UO94J++ikPz9wB/Ci9flq4PUuEtdNwPM2/F1NAEYBW1tZfiXwPiDAOGBDF4lrIvBeJ++rNGCU9Tke2OXj99jp+8vPuDp9f1nfK0Cc9TkC2ACMa1YmoP+Tp/UZgTFmuzFm50mKjQG+McbsNcbUAq8BM4Mc2kxghfV5BXBVkL+vLf78/N7xvgH8QESkC8RlC2PM58CRNorMBF42bl8AiSKS1gXi6nTGmGJjzCbrcwWwHejXrFin7y8/47KFtR+OW5MR1qv5VT0B/Z88rROBn/oBBV7ThQT/DyLFGFMM7j9I4IxWykWJSK6IfCEiwUoW/vz8njLGmDqgDOgdpHjaExfAv1rNCW+IyJlBjslfdvxN+esiq8nhfRE5rzO/2Gq+uAD3Ea43W/dXG3GBTftLRMJEJA84BKwxxrS6zwLxP9ntn0cgIh8DqT4W/dwY864/m/Ax75SvqW0rrnZspr8xpkhEBgCfisgWY8yeU42tGX9+/qDso5Pw5zv/Aqw0xtSIyO24j5AmBTkuf9ixv/yxCfd4M8dF5ErgHWBQZ3yxiMQBbwL3GGPKmy/2sUqn7K+TxGXb/jLG1AMjRSQReFtEhhljvPt+ArrPun0iMMZcdoqbKAS8jyTTgaJT3GabcYnIQRFJM8YUW6fAh1rZRpH1vldE/ob7qCXQicCfn7+hTKGIhAM9CX4TxEnjMsaUek3+L7AwyDH5Kyh/U6fKu6IzxqwWkd+KSB9jTFAHVxORCNyV7R+NMW/5KGLL/jpZXHbtr2YxHLP+96cA3okgoP+T2jQEOcAgETlbRCJxd7wE7QodyyrgRuvzjUCLMxcR6SUiPazPfYCLgfwgxOLPz+8d72zgU2P1UgXRSeNq1o48A3c7b1ewCvixdTXMOKCsoSnQTiKS2tCOLCJjcP//l7a91il/pwDLgO3GmKdbKdbp+8ufuOzYX9Z3JVtnAohINHAZsKNZscD+T3Z2j3hnvoB/wZ05a4CDwIfW/L7Aaq9yV+K+amAP7ialYMfVG/gE2G29J1nzs4Cl1ufxwBbcV8tsAW4JYjwtfn4gG5hhfY4C/gx8A2wEBnTS7+9kcT0ObLP20Vogs5PiWgkUA07r7+sW4Hbgdmu5AIutuLfQyhVrNsR1p9f++gIY3wkxXYK7yeJrIM96XWn3/vIzrk7fX9b3Dge+smLbCjxszQ/a/6QOMaGUUiFOm4aUUirEaSJQSqkQp4lAKaVCnCYCpZQKcZoIlFIqxGkiUMoHETl+8lJtrv+GdUc4IhInIr8TkT3WaJKfi8hYEYm0Pnf7GztV96aJQKkAs8akCTPG7LVmLcV91+cgY8x5uEdN7WPcg+l9AsyxJVClLJoIlGqDdbfrkyKyVUS2iMgca77DGnJgm4i8JyKrRWS2tdp1WHeLi8g5wFjgv40xLnAPGWKM+atV9h2rvFK20VNSpdo2CxgJjAD6ADki8jnuIT8ygPNxjx67HXjJWudi3Hf5ApwH5Bn3IGK+bAUuDErkSvlJzwiUatsluEc4rTfGHAQ+w11xXwL82RjjMsaU4B7iokEacNifjVsJolZE4gMct1J+00SgVNtae9hHWw8BqcI9Fgy4x6oZISJt/a/1AKo7EJtSAaGJQKm2fQ7MsR4Ukoz7cZAbgX/gfiiOQ0RScD/WsMF2YCCAcT8/Ihf4H6+RLAeJyEzrc2/gsDHG2Vk/kFLNaSJQqm1v4x4FcjPwKfAzqynoTdwjfG4Ffof76VZl1jp/pWliuBX3Q4q+EZEtuJ+d0DDe/qXA6uD+CEq1TUcfVaqDRCTOuJ9e1Rv3WcLFxpgSawz5tdZ0a53EDdt4C5hnTv5sbaWCRq8aUqrj3rMeIBIJPGKdKWCMqRKRX+J+ruy3ra1sPXDnHU0Cym56RqCUUiFO+wiUUirEaSJQSqkQp4lAKaVCnCYCpZQKcZoIlFIqxP1/g7ctsx6uEigAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "#accuracy_s1 =np.array(accuracy_s).reshape(len(C_s),len(gamma_s))\n",
    "Otto_SVM_result = pd.read_csv(dpath +\"Otto_SVM_result.csv\")\n",
    "accuracy_s1 = Otto_SVM_result['accuracy']\n",
    "\n",
    "C_s = np.logspace(-1, 3, 5)# logspace(a,b,N)把10的a次方到10的b次方区间分成N份 \n",
    "gamma_s = np.logspace(-1, 1, 3)  \n",
    "accuracy_s1 =np.array(accuracy_s1).reshape(len(C_s),len(gamma_s))\n",
    "\n",
    "x_axis = np.log10(C_s)\n",
    "for j, gamma in enumerate(gamma_s):\n",
    "    plt.plot(x_axis, np.array(accuracy_s1[:,j]), label = ' Test - log(gamma)' + str(np.log10(gamma)))\n",
    "\n",
    "plt.legend()\n",
    "plt.xlabel( 'log(C)' )                                                                                                      \n",
    "plt.ylabel( 'accuracy' )\n",
    "plt.savefig(dpath +'RBF_SVM_Otto.png' )\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10.0\n",
      "1.0\n"
     ]
    }
   ],
   "source": [
    "### 最佳超参数\n",
    "index = np.unravel_index(np.argmax(accuracy_s1, axis=None), accuracy_s1.shape)\n",
    "Best_C = C_s[ index[0] ]\n",
    "Best_gamma = gamma_s[ index[1] ]\n",
    "\n",
    "print(Best_C)\n",
    "print(Best_gamma)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 找到最佳参数后，用全体训练数据训练模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "pycharm": {
     "is_executing": false,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# SVC训练SVC，支持概率输出\n",
    "Best_C = 10\n",
    "Best_gamma = 1.0\n",
    "\n",
    "SVC4 =  SVC( C = Best_C, kernel='rbf', gamma = Best_gamma, probability=True)\n",
    "SVC4.fit(X_train, y_train)\n",
    "\n",
    "#保持模型，用于后续测试\n",
    "import _pickle as cPickle\n",
    "cPickle.dump(SVC4, open(dpath +\"Otto_RBF_SVC.pkl\", 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
